import pandas as pd
import numpy as np
import seaborn as sns
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error
from math import sqrt
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.impute import KNNImputer
import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest
from sklearn.preprocessing import MinMaxScaler
import warnings
from pygam import LinearGAM, s, te
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
# Load the raw single-family housing listings from disk.
original_dat = pd.read_csv('single_housing.csv')
# Display the full frame (notebook cell output).
original_dat
| Street | City | State | Zip | Price | SqFt | Acreage | Beds | Baths | geoadd | Latitude | Longitude | CheckAddDuplicate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 103 Meadow Side Ct | Carrollton | GA | 30116 | 2249 | 1436.0 | 0.42 | 3.0 | 2.0 | 103 Meadow Side Ct, Carrollton, GA 30116 | 33.568164 | -85.040620 | False |
| 1 | 39 Mountain View Dr | Rockmart | GA | 30153 | 2675 | 3107.0 | 0.36 | 4.0 | 3.0 | 39 Mountain View Dr, Rockmart, GA 30153 | 34.009622 | -85.017519 | False |
| 2 | 32 Mountain View Dr | Rockmart | GA | 30153 | 2099 | 1276.0 | 0.35 | 3.0 | 2.0 | 32 Mountain View Dr, Rockmart, GA 30153 | 34.008885 | -85.018616 | False |
| 3 | 381 Wingfoot St | Rockmart | GA | 30153 | 2190 | 1447.0 | 0.33 | 3.0 | 2.0 | 381 Wingfoot St, Rockmart, GA 30153 | 34.004158 | -85.033841 | False |
| 4 | 30 Standing Rock Rd | Senoia | GA | 30276 | 2200 | 1728.0 | 1.03 | 3.0 | 2.0 | 30 Standing Rock Rd, Senoia, GA 30276 | 33.305696 | -84.558150 | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6106 | 6260 New Mill Trl | Acworth | GA | 30102 | 1975 | 1300.0 | 0.69 | 3.0 | 2.0 | 6260 New Mill Trl, Acworth, GA 30102 | 34.107751 | -84.627044 | False |
| 6107 | 115 Jameson Dr | Acworth | GA | 30102 | 2750 | 2113.0 | 0.00 | 3.0 | 2.5 | 115 Jameson Dr, Acworth, GA 30102 | 34.114253 | -84.637448 | False |
| 6108 | 6340 McCollum Ln | Acworth | GA | 30102 | 5750 | 1480.0 | 1.00 | 4.0 | 2.0 | 6340 McCollum Ln, Acworth, GA 30102 | 34.104797 | -84.638603 | False |
| 6109 | 19 Moonlight Dr | Acworth | GA | 30102 | 5200 | 1237.0 | 1.00 | 3.0 | 2.5 | 19 Moonlight Dr, Acworth, GA 30102 | 34.111894 | -84.659457 | False |
| 6110 | 122 Highland View Pass | White | GA | 30184 | 2475 | 2468.0 | 2.00 | 5.0 | 3.0 | 122 Highland View Pass, White, GA 30184 | 34.214803 | -84.631582 | False |
6111 rows × 13 columns
# Create an independent copy of the raw data so later cleaning steps can
# never mutate original_dat through a shared reference — plain assignment
# (`dat = original_dat`) would only alias the same DataFrame object.
dat = original_dat.copy()
# Visualize the price distribution to spot outliers.
sns.boxplot(data=dat, x='Price')
<AxesSubplot:xlabel='Price'>
# Remove price outliers using the standard 1.5 * IQR fence.
q1 = dat['Price'].quantile(0.25)
q3 = dat['Price'].quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
dat = dat[(dat['Price'] > lower) & (dat['Price'] < upper)]
# Number of rows removed as outliers.
len(original_dat) - len(dat)
527
# Keep only the numeric feature columns for modeling.
non_numeric = ['Street', 'City', 'State', 'Zip', 'geoadd', 'CheckAddDuplicate']
dat = dat.drop(columns=non_numeric)
# Summary statistics of the remaining columns.
dat.describe()
| Price | SqFt | Acreage | Beds | Baths | Latitude | Longitude | |
|---|---|---|---|---|---|---|---|
| count | 5584.000000 | 5436.000000 | 5584.000000 | 5564.000000 | 5583.000000 | 5584.000000 | 5584.000000 |
| mean | 2137.112643 | 1782.698389 | 0.912957 | 3.192128 | 2.254164 | 33.773794 | -84.369918 |
| std | 444.120790 | 892.503206 | 22.728031 | 0.852781 | 0.628198 | 0.215588 | 0.286286 |
| min | 906.000000 | 100.000000 | 0.000000 | 1.000000 | 0.500000 | 33.294761 | -85.048940 |
| 25% | 1845.000000 | 1346.750000 | 0.000000 | 3.000000 | 2.000000 | 33.606029 | -84.570788 |
| 50% | 2099.000000 | 1686.500000 | 0.000000 | 3.000000 | 2.000000 | 33.772935 | -84.364104 |
| 75% | 2399.250000 | 2126.500000 | 0.350000 | 4.000000 | 2.500000 | 33.925092 | -84.153509 |
| max | 3450.000000 | 49299.440000 | 1332.000000 | 8.000000 | 8.000000 | 34.365909 | -83.782030 |
# Count remaining missing values per column.
dat.isna().sum()
Price 0 SqFt 148 Acreage 0 Beds 20 Baths 1 Latitude 0 Longitude 0 dtype: int64
# Distribution of the response variable after outlier removal.
sns.histplot(data=dat, x='Price', bins=10)
<AxesSubplot:xlabel='Price', ylabel='Count'>
# Dataset explanations:
#   original_dat — uncleaned dataset
#   dat          — cleaned dataset that still contains NA values
# The response variable (Price) has no NAs.
y = dat['Price'].values
# Price is the first column, so everything else is a predictor.
X = dat.drop(columns=['Price'])
# Imputation strategies compared below:
#   X_simple — NAs filled with column means
#   X_knn    — NAs filled with KNNImputer
#   X_forest — NAs filled with MissForest
col_means = X.mean()
X_simple = X.fillna(col_means).values
# Baseline: GAM on mean-imputed predictors; score is sqrt of the GCV statistic.
spline_terms = s(0) + s(1) + s(2) + s(3) + s(4) + s(5)
gam_reg = LinearGAM(spline_terms).fit(X_simple, y)
cv_score = sqrt(gam_reg.statistics_['GCV'])
cv_score
306.66644690422464
# A different approach to NA: tune KNNImputer's n_neighbors against the
# GAM's sqrt(GCV) error.
k = list(range(5, 100, 5)) + list(range(100, 501, 50))
outsample = []
for n_neighbors in k:
    # Impute the predictors with the current neighbor count.
    imputer = KNNImputer(n_neighbors=n_neighbors)
    # fit_transform already returns an ndarray — no DataFrame round-trip
    # (the original wrapped it in pd.DataFrame(...).values for nothing).
    X_temp = imputer.fit_transform(X)
    # Evaluate on the GAM model.
    gam_reg = LinearGAM(s(0) + s(1) + s(2) + s(3) + s(4) + s(5)).fit(X_temp, y)
    outsample.append(sqrt(gam_reg.statistics_['GCV']))
knn_table = pd.DataFrame({'k': k, 'out-of-sample error': outsample})
sns.lineplot(data=knn_table, x='k', y='out-of-sample error')
<AxesSubplot:xlabel='k', ylabel='out-of-sample error'>
# Finalize the KNN-imputed dataset with the chosen neighbor count.
knn_imputer = KNNImputer(n_neighbors=100)
X_knn = pd.DataFrame(knn_imputer.fit_transform(X), columns=list(X.columns))
# Random-forest-based imputation (MissForest) as the alternative.
forest_imputer = MissForest()
X_forest = pd.DataFrame(forest_imputer.fit_transform(X), columns=list(X.columns))
Iteration: 0 Iteration: 1 Iteration: 2 Iteration: 3
# Evaluate the GAM on the MissForest-imputed predictors; sqrt(GCV) score.
terms = s(0) + s(1) + s(2) + s(3) + s(4) + s(5)
gam_reg = LinearGAM(terms).fit(X_forest, y)
cv_score = sqrt(gam_reg.statistics_['GCV'])
cv_score
299.69355898363153
# Prep the data: structural features only from the MissForest imputation.
X = X_forest[['SqFt', 'Acreage', 'Beds', 'Baths']].values
# Alternative feature sets tried:
# X = X_knn[['Latitude','Longitude']].values
# X = X_knn.values
# Tune the KNN regressor: 10-fold CV RMSE for each candidate k.
outsample = []
k = list(range(1, 20)) + list(range(20, 100, 5)) + list(range(100, 501, 100))
for n_neighbors in k:  # iterate values directly instead of range(len(k))
    knn_model = KNeighborsRegressor(n_neighbors=n_neighbors)
    scores = cross_validate(knn_model, X, y, cv=10,
                            scoring='neg_mean_squared_error')['test_score']
    # scores are negative MSEs — negate before taking the square root
    outsample.append(np.mean(np.sqrt(-scores)))
knn_table = pd.DataFrame({'k': k, 'out-of-sample error': outsample})
sns.lineplot(data=knn_table, x='k', y='out-of-sample error')
<AxesSubplot:xlabel='k', ylabel='out-of-sample error'>
# Report the k with the smallest cross-validated error
# (argmin returns the first minimum, matching list.index(min(...))).
best_idx = int(np.argmin(outsample))
optimalk = k[best_idx]
print(f'The best k for out-of-sample prediction: {optimalk}')
print(f'The best cv out-of-sample error: {round(outsample[best_idx], 3)}')
The best k for out-of-sample prediction: 200 The best cv out-of-sample error: 337.189
# Min-max scale every MissForest feature into [0, 1] before re-tuning KNN.
scaler = MinMaxScaler()
X = scaler.fit_transform(X_forest)
# Sanity check: all columns now span [0, 1].
pd.DataFrame(X, columns=X_forest.columns).describe()
| SqFt | Acreage | Beds | Baths | Latitude | Longitude | |
|---|---|---|---|---|---|---|
| count | 5584.000000 | 5584.000000 | 5584.000000 | 5584.000000 | 5584.000000 | 5584.000000 |
| mean | 0.033891 | 0.000685 | 0.312272 | 0.233887 | 0.447215 | 0.535967 |
| std | 0.018064 | 0.017063 | 0.122686 | 0.083752 | 0.201269 | 0.225972 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.024919 | 0.000000 | 0.285714 | 0.200000 | 0.290593 | 0.377416 |
| 50% | 0.031931 | 0.000000 | 0.285714 | 0.200000 | 0.446412 | 0.540556 |
| 75% | 0.040819 | 0.000263 | 0.428571 | 0.266667 | 0.588462 | 0.706783 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# Tune the KNN regressor on the scaled feature set: 10-fold CV RMSE per k.
outsample = []
k = list(range(1, 50)) + list(range(50, 501, 50))
for n_neighbors in k:  # iterate values directly instead of range(len(k))
    knn_model = KNeighborsRegressor(n_neighbors=n_neighbors)
    scores = cross_validate(knn_model, X, y, cv=10,
                            scoring='neg_mean_squared_error')['test_score']
    # scores are negative MSEs — negate before taking the square root
    outsample.append(np.mean(np.sqrt(-scores)))
knn_table = pd.DataFrame({'k': k, 'out-of-sample error': outsample})
sns.lineplot(data=knn_table, x='k', y='out-of-sample error')
<AxesSubplot:xlabel='k', ylabel='out-of-sample error'>
# Report the best k on the scaled features
# (argmin returns the first minimum, matching list.index(min(...))).
best_idx = int(np.argmin(outsample))
optimalk = k[best_idx]
print(f'The best k for out-of-sample prediction: {optimalk}')
print(f'The best cv out-of-sample error: {round(outsample[best_idx], 3)}')
The best k for out-of-sample prediction: 14 The best cv out-of-sample error: 321.491
# Split the full MissForest feature matrix 70/30 with a fixed seed so the
# tree-based model comparisons below all see the same held-out set.
X = X_forest.values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=12345)
# Tune the random forest on held-out RMSE over a small grid of
# max_features x min_samples_leaf.
maxfeatures = [2, 3, 4, 5, 6]
samplesleaf = list(range(1, 15)) + list(range(15, 50, 5))
bestmaxfeature = None          # None sentinel instead of magic 99999
bestsamplesleaf = None
best_outsample = float('inf')  # any real RMSE beats infinity
for max_features in tqdm(maxfeatures):
    for min_samples_leaf in samplesleaf:
        # NOTE(review): no random_state is set, so repeated runs can pick
        # different winners — confirm whether reproducibility matters here.
        rf_model = RandomForestRegressor(n_estimators=500,
                                         max_features=max_features,
                                         min_samples_leaf=min_samples_leaf)
        rf_model.fit(X_train, y_train)
        # Out-of-sample RMSE on the held-out 30%.
        rmse = sqrt(mean_squared_error(y_test, rf_model.predict(X_test)))
        if rmse < best_outsample:
            bestmaxfeature = max_features
            bestsamplesleaf = min_samples_leaf
            best_outsample = rmse
print('The best max_features for out-of-sample prediction: ' + str(bestmaxfeature))
print('The best min_samples_leaf for out-of-sample prediction: ' + str(bestsamplesleaf))
print('The best out-of-sample error: ' + str(round(best_outsample, 3)))
The best max_features for out-of-sample prediction: 6 The best min_samples_leaf for out-of-sample prediction: 3 The best out-of-sample error: 265.401
#computation time: 5 min.
#best max_features: 6
#best min_samples_leaf: 3
#best out-of-sample error: 267.38
# Build a 50x50 latitude/longitude grid, holding the other features at
# their means, to visualize the fitted price surface.
lat_grid = list(np.linspace(X_forest['Latitude'].min(), X_forest['Latitude'].max(), 50))
lon_grid = list(np.linspace(X_forest['Longitude'].min(), X_forest['Longitude'].max(), 50))
temp_table = pd.DataFrame({'Latitude': lat_grid * 50,
                           'Longitude': np.repeat(lon_grid, 50)})
for col in ['SqFt', 'Acreage', 'Beds', 'Baths']:
    temp_table[col] = X_forest[col].mean()
# Match the training column order before predicting.
temp_table = temp_table.reindex(columns=list(X_forest.columns))
# Fit AdaBoost with the tuned settings and predict prices over the grid.
ada_model = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=10),
                              n_estimators=500,
                              learning_rate=0.05)
ada_model.fit(X_train, y_train)
temp_table['Price'] = ada_model.predict(temp_table)
temp_table
| SqFt | Acreage | Beds | Baths | Latitude | Longitude | Price | |
|---|---|---|---|---|---|---|---|
| 0 | 1767.42646 | 0.912957 | 3.185906 | 2.254151 | 33.294761 | -85.04894 | 2127.300000 |
| 1 | 1767.42646 | 0.912957 | 3.185906 | 2.254151 | 33.316622 | -85.04894 | 2127.300000 |
| 2 | 1767.42646 | 0.912957 | 3.185906 | 2.254151 | 33.338482 | -85.04894 | 2127.300000 |
| 3 | 1767.42646 | 0.912957 | 3.185906 | 2.254151 | 33.360342 | -85.04894 | 2124.870968 |
| 4 | 1767.42646 | 0.912957 | 3.185906 | 2.254151 | 33.382202 | -85.04894 | 2117.704082 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2495 | 1767.42646 | 0.912957 | 3.185906 | 2.254151 | 34.278468 | -83.78203 | 2158.795918 |
| 2496 | 1767.42646 | 0.912957 | 3.185906 | 2.254151 | 34.300329 | -83.78203 | 2158.795918 |
| 2497 | 1767.42646 | 0.912957 | 3.185906 | 2.254151 | 34.322189 | -83.78203 | 2158.795918 |
| 2498 | 1767.42646 | 0.912957 | 3.185906 | 2.254151 | 34.344049 | -83.78203 | 2158.795918 |
| 2499 | 1767.42646 | 0.912957 | 3.185906 | 2.254151 | 34.365909 | -83.78203 | 2158.795918 |
2500 rows × 7 columns
# Contour plot of the predicted price surface over the lat/long grid.
contour = go.Contour(
    z=temp_table['Price'],
    x=temp_table['Latitude'],   # horizontal axis
    y=temp_table['Longitude'],  # vertical axis
)
fig = go.Figure(data=contour)
fig.show()
# Tune AdaBoost over base-tree depth x number of trees x learning rate.
base_model = list(range(1, 16))
tree_num = [500, 1000]
learning_rate = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5]
best_outsample = float('inf')  # any real RMSE beats infinity
best_base_model = None         # None sentinels instead of magic 99999
best_tree_num = None
best_learning_rate = None
for depth in tqdm(base_model):
    for n_trees in tree_num:
        # Named loop variables also stop this loop from shadowing the
        # hyperparameter list `k` used by the earlier KNN cells.
        for lr in learning_rate:
            ada_model = AdaBoostRegressor(
                base_estimator=DecisionTreeRegressor(max_depth=depth),
                n_estimators=n_trees,
                learning_rate=lr)
            ada_model.fit(X_train, y_train)
            # Out-of-sample RMSE on the held-out 30%.
            rmse = sqrt(mean_squared_error(y_test, ada_model.predict(X_test)))
            if rmse < best_outsample:
                best_outsample = rmse
                best_base_model = depth
                best_tree_num = n_trees
                best_learning_rate = lr
print('The best base_model_depth for out-of-sample prediction: ' + str(best_base_model))
print('The best tree_num for out-of-sample prediction: ' + str(best_tree_num))
print('The best learning_rate for out-of-sample prediction: ' + str(best_learning_rate))
print('The best out-of-sample error: ' + str(round(best_outsample, 3)))
#computation time: 24 min.
#best base_model_depth: 10
#best tree_num: 500
#best learning_rate: 0.05
#best out-of-sample error: 266.38
# Tune gradient boosting over max_depth x number of trees x learning rate.
maxdepth = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
tree_num = [500, 1000, 1500]
learning_rate = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5]
best_outsample = float('inf')  # any real RMSE beats infinity
best_maxdepth = None           # None sentinels instead of magic 99999
best_tree_num = None
best_learning_rate = None
for depth in tqdm(maxdepth):
    for n_trees in tree_num:
        for lr in learning_rate:
            grad_model = GradientBoostingRegressor(max_depth=depth,
                                                   n_estimators=n_trees,
                                                   learning_rate=lr)
            grad_model.fit(X_train, y_train)
            # Out-of-sample RMSE on the held-out 30%.
            rmse = sqrt(mean_squared_error(y_test, grad_model.predict(X_test)))
            if rmse < best_outsample:
                best_outsample = rmse
                best_maxdepth = depth
                best_tree_num = n_trees
                best_learning_rate = lr
print('The best max_depth for out-of-sample prediction: ' + str(best_maxdepth))
print('The best tree_num for out-of-sample prediction: ' + str(best_tree_num))
print('The best learning_rate for out-of-sample prediction: ' + str(best_learning_rate))
print('The best out-of-sample error: ' + str(round(best_outsample, 3)))
#computation time: 21 min.
#best max_depth: 6
#best tree_num: 1500
#best learning_rate: 0.005
#best out-of-sample error: 265.98
# Scale features to [0, 1] for the neural network; fit on the training
# split only so no test-set statistics leak into the transform.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
pd.DataFrame(X_train_scaled, columns=X_forest.columns).describe()
pd.DataFrame(X_test_scaled, columns=X_forest.columns).describe()
# Silence sklearn warnings (e.g. MLP convergence) during the sweep.
warnings.simplefilter('ignore')
# Tune the neural network over hidden-layer shape x activation function.
hiddenlayer = [(10,), (20,), (30,), (40,), (50,), (60,), (70,), (80,), (90,), (100,),
               (10, 10,), (20, 20,), (30, 30,), (40, 40,), (50, 50,)]
activation = ['identity', 'logistic', 'tanh', 'relu']
best_outsample = float('inf')  # any real RMSE beats infinity
best_hiddenlayer = None        # None sentinels instead of 99999 (which was
best_activation = None         # an int standing in for a string value)
for layers in tqdm(hiddenlayer):
    for act in activation:
        nn_model = MLPRegressor(hidden_layer_sizes=layers,
                                activation=act,
                                learning_rate_init=0.005,
                                max_iter=10000,
                                random_state=12345)
        nn_model.fit(X_train_scaled, y_train)
        # Out-of-sample RMSE on the scaled held-out set.
        rmse = sqrt(mean_squared_error(y_test, nn_model.predict(X_test_scaled)))
        if rmse < best_outsample:
            best_outsample = rmse
            best_activation = act
            best_hiddenlayer = layers
print('The best hiddenlayer for out-of-sample prediction: ' + str(best_hiddenlayer))
print('The best activation for out-of-sample prediction: ' + str(best_activation))
print('The best out-of-sample error: ' + str(round(best_outsample, 3)))
#computation time: 32 min.
#best hiddenlayer: (90,)
#best activation: 'tanh'
#best out-of-sample error: 271.31